load libraries

library(ggbiplot)
## Loading required package: ggplot2
## Loading required package: plyr
## Loading required package: scales
## Warning: package 'scales' was built under R version 3.3.2
## Loading required package: grid
library(nnet)
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.3.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(GGally)
## 
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
## 
##     nasa
library(randomForest)
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(plotROC)
library(knitr)

read data

total = read.csv('totale.csv', stringsAsFactors = F)

argenine = read.csv('argenine-totale.csv', stringsAsFactors = F)
total_noSeg = read.csv("total_data.csv", stringsAsFactors = F)
total_noSeg$argenine= argenine$argenine
total_noSeg$health2[total_noSeg$health2==1]= 'ILL'
total_noSeg$health2[total_noSeg$health2==2]= 'Healthy'
total_noSeg$recoddiseas2[is.na(total_noSeg$recoddiseas2)]= 0

{r} # hist((total_noSeg$aspartate- mean(total_noSeg$aspartate))/sqrt(var(total_noSeg$aspartate)), breaks = 20) # # shapiro.test((total_noSeg$aspartate- mean(total_noSeg$aspartate))/sqrt(var(total_noSeg$aspartate))) # # hist(total_noSeg$glutamate, breaks = 20) # # hist(total_noSeg$citrolline, breaks = 20) #

seperate data to ill and healthy

total_noSeg %>% filter(health2=='ILL')-> ill
## Warning: package 'bindrcpp' was built under R version 3.3.2
## Warning: `as_dictionary()` is soft-deprecated as of rlang 0.3.0.
## Please use `as_data_pronoun()` instead
## This warning is displayed once per session.
## Warning: `new_overscope()` is soft-deprecated as of rlang 0.2.0.
## Please use `new_data_mask()` instead
## This warning is displayed once per session.
## Warning: The `parent` argument of `new_data_mask()` is deprecated.
## The parent of the data mask is determined from either:
## 
##   * The `env` argument of `eval_tidy()`
##   * Quosure environments when applicable
## This warning is displayed once per session.
## Warning: `overscope_clean()` is soft-deprecated as of rlang 0.2.0.
## This warning is displayed once per session.
total_noSeg %>% filter(health2=='Healthy')-> healthy

pair-wise scatterplot

ggpairs(data=total_noSeg,             # data.frame with variables
           mapping = ggplot2::aes(colour = as.factor(health2)),
lower = list(continuous = wrap("smooth", method = "lm")),
             columns=6:8,           # columns to plot, default to all.
           title=("Vitiligo data")    # title of the plot
)

correlation of amino acids in ill and healthy groups

ggcorr(total_noSeg[, c(6:27)], palette = "RdBu", label = F)

PCA(without proportions)

pca = prcomp(total_noSeg[, 6:27], scale. = T )
eigv = round(pca$sdev^2/sum(pca$sdev^2)*100, 2)
eigv = data.frame(c(1:22),eigv)
names(eigv) = c('PCs','Variance')
# plot(eigv,type = "b",col = "darkblue",lwd = 2);grid()

PCA = pca$sdev^2
names(PCA) = paste0('PC', eigv$PCs)
qcc::pareto.chart(PCA)

##       
## Pareto chart analysis for PCA
##         Frequency Cum.Freq. Percentage Cum.Percent.
##   PC1  7.84117244  7.841172 35.6416929     35.64169
##   PC2  3.67095704 11.512129 16.6861684     52.32786
##   PC3  1.75973707 13.271867  7.9988048     60.32667
##   PC4  1.22317726 14.495044  5.5598966     65.88656
##   PC5  1.03993471 15.534979  4.7269760     70.61354
##   PC6  0.93403939 16.469018  4.2456336     74.85917
##   PC7  0.82609387 17.295112  3.7549721     78.61414
##   PC8  0.75455680 18.049669  3.4298037     82.04395
##   PC9  0.66718557 18.716854  3.0326617     85.07661
##   PC10 0.49948225 19.216336  2.2703739     87.34698
##   PC11 0.49321390 19.709550  2.2418814     89.58887
##   PC12 0.43796486 20.147515  1.9907494     91.57961
##   PC13 0.39855576 20.546071  1.8116171     93.39123
##   PC14 0.30454543 20.850616  1.3842974     94.77553
##   PC15 0.26097433 21.111591  1.1862470     95.96178
##   PC16 0.23876983 21.350361  1.0853174     97.04709
##   PC17 0.18794711 21.538308  0.8543050     97.90140
##   PC18 0.15869163 21.696999  0.7213256     98.62272
##   PC19 0.11975295 21.816752  0.5443316     99.16706
##   PC20 0.07603278 21.892785  0.3456035     99.51266
##   PC21 0.06150454 21.954290  0.2795661     99.79223
##   PC22 0.04571047 22.000000  0.2077749    100.00000
biplot(pca, cex= 0.5)

Classification using PC1 and PC2: health2

ggbiplot(pca, obs.scale = 1, var.scale = 1,
  groups = total_noSeg$health2, ellipse = TRUE, circle = TRUE) +
  scale_color_discrete(name = '') +
  theme(legend.direction = 'horizontal', legend.position = 'top')

Classification using PC1 and PC2: nonsegmentale

ggbiplot(pca, obs.scale = 1, var.scale = 1,
  groups = as.factor(total_noSeg$nonsegmentale), ellipse = TRUE, circle = TRUE) +
  scale_color_discrete(name = '') +
  theme(legend.direction = 'horizontal', legend.position = 'top')

PCA(with proportions)

pca = prcomp(total_noSeg[, 6:45], scale. = T )
eigv = round(pca$sdev^2/sum(pca$sdev^2)*100, 2)
eigv = data.frame(c(1:40),eigv)
names(eigv) = c('PCs','Variance')
# plot(eigv,type = "b",col = "darkblue",lwd = 2);grid()

PCA = pca$sdev^2
names(PCA) = paste0('PC', eigv$PCs)
qcc::pareto.chart(PCA)

##       
## Pareto chart analysis for PCA
##          Frequency Cum.Freq.   Percentage Cum.Percent.
##   PC1  8.730764084  8.730764 21.826910211     21.82691
##   PC2  5.968581762 14.699346 14.921454404     36.74836
##   PC3  3.925588279 18.624934  9.813970698     46.56234
##   PC4  3.088183995 21.713118  7.720459989     54.28280
##   PC5  2.760456722 24.473575  6.901141806     61.18394
##   PC6  2.371001651 26.844576  5.927504127     67.11144
##   PC7  1.652114370 28.496691  4.130285924     71.24173
##   PC8  1.431774832 29.928466  3.579437079     74.82116
##   PC9  1.245777010 31.174243  3.114442524     77.93561
##   PC10 1.025514306 32.199757  2.563785765     80.49939
##   PC11 0.956852902 33.156610  2.392132256     82.89152
##   PC12 0.871254883 34.027865  2.178137207     85.06966
##   PC13 0.704561125 34.732426  1.761402813     86.83106
##   PC14 0.674038678 35.406465  1.685096694     88.51616
##   PC15 0.639180716 36.045645  1.597951789     90.11411
##   PC16 0.529362576 36.575008  1.323406440     91.43752
##   PC17 0.481647839 37.056656  1.204119598     92.64164
##   PC18 0.411155933 37.467812  1.027889832     93.66953
##   PC19 0.376511206 37.844323  0.941278016     94.61081
##   PC20 0.337844446 38.182167  0.844611116     95.45542
##   PC21 0.300062556 38.482230  0.750156390     96.20557
##   PC22 0.254954516 38.737184  0.637386289     96.84296
##   PC23 0.247645273 38.984830  0.619113183     97.46207
##   PC24 0.176469708 39.161299  0.441174271     97.90325
##   PC25 0.161531791 39.322831  0.403829477     98.30708
##   PC26 0.122199299 39.445030  0.305498248     98.61258
##   PC27 0.106995231 39.552026  0.267488078     98.88006
##   PC28 0.094485092 39.646511  0.236212730     99.11628
##   PC29 0.077503553 39.724014  0.193758882     99.31004
##   PC30 0.065377214 39.789392  0.163443035     99.47348
##   PC31 0.046753862 39.836145  0.116884655     99.59036
##   PC32 0.042576305 39.878722  0.106440761     99.69680
##   PC33 0.038084729 39.916806  0.095211822     99.79202
##   PC34 0.032193511 39.949000  0.080483777     99.87250
##   PC35 0.018120596 39.967121  0.045301490     99.91780
##   PC36 0.012985963 39.980107  0.032464907     99.95027
##   PC37 0.007234250 39.987341  0.018085624     99.96835
##   PC38 0.005564460 39.992905  0.013911150     99.98226
##   PC39 0.004678498 39.997584  0.011696245     99.99396
##   PC40 0.002416279 40.000000  0.006040698    100.00000
biplot(pca, cex= 0.5)

Classification using PC1 and PC2: health2

ggbiplot(pca, obs.scale = 1, var.scale = 1,
  groups = total_noSeg$health2, ellipse = TRUE, circle = TRUE) +
  scale_color_discrete(name = '') +
  theme(legend.direction = 'horizontal', legend.position = 'top')

Classification using PC1 and PC2: nonsegmentale

ggbiplot(pca, obs.scale = 1, var.scale = 1,
  groups = as.factor(total_noSeg$nonsegmentale), ellipse = TRUE, circle = TRUE) +
  scale_color_discrete(name = '') +
  theme(legend.direction = 'horizontal', legend.position = 'top')

Mann-Whitney test on all the columns(both aminoacids and proportions) vs health2

ill_healthy_mw=as.data.frame(sapply(total_noSeg[, c(6:45)], function(x)
  wilcox.test(x~total_noSeg$health2)
   )   )


ill_healthy_mw = rbind(ill_healthy_mw ,  adjusted_p.value_hochberg = p.adjust(unlist(ill_healthy_mw[3,]), method = 'hochberg'))

ill_healthy_mw = rbind(ill_healthy_mw ,  adjusted_p.value_fdr = p.adjust(unlist(ill_healthy_mw[3,]), method = 'fdr'))

ill_healthy_mw = rbind(ill_healthy_mw, ill_mean =colMeans(total_noSeg[total_noSeg$health2=='ILL', 6:45]) )
ill_healthy_mw = rbind(ill_healthy_mw, healthy_mean =colMeans(total_noSeg[total_noSeg$health2=='Healthy', 6:45]) )

kable(ill_healthy_mw)
aspartate glutamate asparagine serine glutamine glycine theronine histithine citrolline alanine argenine tyrosine cysteine valine methionine tryptophane phenylalanine isoleucine leucine ornitine lysine proline phetotyr valtophe Xletophe XletoAla Xletotyr mettophe mettotyr mettoxle mettocit glutocit alatocit orntocit cittophe argtophe argtoala argtoorn cittoarg tyrtocys
statistic 294 373 614 520 750 818 626 864 481 453 1052 304 199 520 740 322 443 548 568 1020 1085 262 870 688 804 775 998 796 1051 809 835 765 636 991 705 1119 1067 547 676 958
parameter NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL
p.value 7.470903e-05 0.002892484 0.8682826 0.2131406 0.1664351 0.02941469 0.9771904 0.006423474 0.08963578 0.04288233 2.871616e-07 0.0001262896 1.696178e-07 0.2131406 0.2047685 0.000309769 0.03215244 0.3563652 0.4885482 2.685578e-06 2.102398e-08 1.215316e-05 0.005149363 0.5030627 0.0441031 0.09412582 1.077071e-05 0.05494843 3.092354e-07 0.03827753 0.01735733 0.1193018 0.9407409 1.637282e-05 0.387128 9.531111e-10 9.128393e-08 0.3504003 0.5946418 0.0001026077
null.value 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
alternative two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided
method Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test
data.name x by total_noSeg$health2 x by total_noSeg$health2 x by total_noSeg$health2 x by total_noSeg$health2 x by total_noSeg$health2 x by total_noSeg$health2 x by total_noSeg$health2 x by total_noSeg$health2 x by total_noSeg$health2 x by total_noSeg$health2 x by total_noSeg$health2 x by total_noSeg$health2 x by total_noSeg$health2 x by total_noSeg$health2 x by total_noSeg$health2 x by total_noSeg$health2 x by total_noSeg$health2 x by total_noSeg$health2 x by total_noSeg$health2 x by total_noSeg$health2 x by total_noSeg$health2 x by total_noSeg$health2 x by total_noSeg$health2 x by total_noSeg$health2 x by total_noSeg$health2 x by total_noSeg$health2 x by total_noSeg$health2 x by total_noSeg$health2 x by total_noSeg$health2 x by total_noSeg$health2 x by total_noSeg$health2 x by total_noSeg$health2 x by total_noSeg$health2 x by total_noSeg$health2 x by total_noSeg$health2 x by total_noSeg$health2 x by total_noSeg$health2 x by total_noSeg$health2 x by total_noSeg$health2 x by total_noSeg$health2
adjusted_p.value_hochberg 0.002241271 0.07520457 0.9771904 0.9771904 0.9771904 0.6471231 0.9771904 0.1541634 0.9771904 0.7938558 1.033782e-05 0.003536108 6.275859e-06 0.9771904 0.9771904 0.008363764 0.6752013 0.9771904 0.9771904 9.130966e-05 8.199353e-07 0.000388901 0.1287341 0.9771904 0.7938558 0.9771904 0.0003554335 0.9341233 1.082324e-05 0.7655506 0.3992185 0.9771904 0.9771904 0.0005075574 0.9771904 3.812444e-08 3.468789e-06 0.9771904 0.9771904 0.002975622
adjusted_p.value_fdr 0.0002716692 0.00771329 0.9139817 0.2750201 0.2377644 0.06192566 0.9771904 0.01511406 0.1434173 0.07670104 2.061569e-06 0.0003885833 1.696178e-06 0.2750201 0.2750201 0.0008850544 0.06430489 0.4319578 0.5583408 1.534616e-05 4.204796e-07 5.401403e-05 0.01287341 0.5589585 0.07670104 0.144809 5.385356e-05 0.09158071 2.061569e-06 0.07290958 0.03857183 0.1767434 0.9648624 6.549127e-05 0.4554448 3.812444e-08 1.217119e-06 0.4319578 0.642856 0.0003420255
ill_mean 4.052947 25.71046 34.53927 65.11522 446.2532 347.6143 90.41223 41.24307 23.52489 317.1558 52.79352 59.25787 240.2853 180.0727 23.17314 37.60369 39.23904 89.37466 75.96696 39.18057 105.1448 153.963 0.6754138 4.833879 4.285784 0.5466366 2.819966 0.6376018 0.4019113 0.145391 1.044568 20.88475 13.68517 1.873045 0.6207717 1.38737 0.176677 1.466339 0.3726656 0.3249112
healthy_mean 3.124116 16.89225 35.607 60.76664 499.0092 433.6578 91.01064 50.49853 21.41336 276.4907 76.72541 45.13108 106.5219 160.3457 24.21595 30.74202 34.33406 82.61671 71.83639 60.22598 156.3828 111.1731 0.8031685 4.699328 4.57674 0.5917922 3.650282 0.688878 0.5786692 0.1591727 1.195207 23.70645 14.02739 2.731364 0.6419407 2.311366 0.296098 1.388073 0.386422 0.5218794

kolmogorov-smirnov test on all the columns(both aminoacids and proportions) vs health2

ill_healthy_ks=as.data.frame(sapply(total_noSeg[, c(6:45)], function(x)
  ks.test(x[total_noSeg$health2=='ILL'], x[total_noSeg$health2=='Healthy'])
   ))
ill_healthy_ks = rbind(ill_healthy_ks ,  adjusted_p.value_hoshberg = p.adjust(unlist(ill_healthy_ks[2,]), method = 'hochberg'))

ill_healthy_ks = rbind(ill_healthy_ks ,  adjusted_p.value_fdr = p.adjust(unlist(ill_healthy_ks[2,]), method = 'fdr'))

ill_healthy_ks = rbind(ill_healthy_ks, ill_mean =colMeans(total_noSeg[total_noSeg$health2=='ILL', 6:45]) )
ill_healthy_ks = rbind(ill_healthy_ks, healthy_mean =colMeans(total_noSeg[total_noSeg$health2=='Healthy', 6:45]) )

kable(ill_healthy_ks)
aspartate glutamate asparagine serine glutamine glycine theronine histithine citrolline alanine argenine tyrosine cysteine valine methionine tryptophane phenylalanine isoleucine leucine ornitine lysine proline phetotyr valtophe Xletophe XletoAla Xletotyr mettophe mettotyr mettoxle mettocit glutocit alatocit orntocit cittophe argtophe argtoala argtoorn cittoarg tyrtocys
statistic 0.4721781 0.4570747 0.13593 0.2416534 0.2440382 0.2535771 0.145469 0.4324324 0.2559618 0.3076312 0.6073132 0.4753577 0.6931638 0.1995231 0.1891892 0.4944356 0.2686804 0.1868045 0.1677266 0.5039746 0.627186 0.5015898 0.3449921 0.2265501 0.4252782 0.3418124 0.4602544 0.3052464 0.600159 0.2710652 0.4205087 0.2662957 0.1414944 0.5580286 0.182035 0.7154213 0.6025437 0.2074722 0.2265501 0.445151
p.value 0.000391774 0.000691112 0.8323904 0.2061064 0.1978191 0.1658162 0.7835548 0.001592514 0.1579749 0.05372972 1.219774e-06 0.0003315605 1.021109e-08 0.4112498 0.4685635 0.0001759579 0.1231664 0.4864487 0.6204319 0.0001135205 4.947073e-07 0.0001249568 0.02095612 0.2685011 0.002019555 0.02290865 0.0006351303 0.05696566 1.927058e-06 0.1179892 0.002367018 0.1286008 0.8023009 1.100577e-05 0.5241761 3.439466e-09 1.710893e-06 0.3656205 0.2685011 0.001035115
alternative two-sided two-sided two-sided two-sided two-sided two-sided two-sided two-sided two-sided two-sided two-sided two-sided two-sided two-sided two-sided two-sided two-sided two-sided two-sided two-sided two-sided two-sided two-sided two-sided two-sided two-sided two-sided two-sided two-sided two-sided two-sided two-sided two-sided two-sided two-sided two-sided two-sided two-sided two-sided two-sided
method Two-sample Kolmogorov-Smirnov test Two-sample Kolmogorov-Smirnov test Two-sample Kolmogorov-Smirnov test Two-sample Kolmogorov-Smirnov test Two-sample Kolmogorov-Smirnov test Two-sample Kolmogorov-Smirnov test Two-sample Kolmogorov-Smirnov test Two-sample Kolmogorov-Smirnov test Two-sample Kolmogorov-Smirnov test Two-sample Kolmogorov-Smirnov test Two-sample Kolmogorov-Smirnov test Two-sample Kolmogorov-Smirnov test Two-sample Kolmogorov-Smirnov test Two-sample Kolmogorov-Smirnov test Two-sample Kolmogorov-Smirnov test Two-sample Kolmogorov-Smirnov test Two-sample Kolmogorov-Smirnov test Two-sample Kolmogorov-Smirnov test Two-sample Kolmogorov-Smirnov test Two-sample Kolmogorov-Smirnov test Two-sample Kolmogorov-Smirnov test Two-sample Kolmogorov-Smirnov test Two-sample Kolmogorov-Smirnov test Two-sample Kolmogorov-Smirnov test Two-sample Kolmogorov-Smirnov test Two-sample Kolmogorov-Smirnov test Two-sample Kolmogorov-Smirnov test Two-sample Kolmogorov-Smirnov test Two-sample Kolmogorov-Smirnov test Two-sample Kolmogorov-Smirnov test Two-sample Kolmogorov-Smirnov test Two-sample Kolmogorov-Smirnov test Two-sample Kolmogorov-Smirnov test Two-sample Kolmogorov-Smirnov test Two-sample Kolmogorov-Smirnov test Two-sample Kolmogorov-Smirnov test Two-sample Kolmogorov-Smirnov test Two-sample Kolmogorov-Smirnov test Two-sample Kolmogorov-Smirnov test Two-sample Kolmogorov-Smirnov test
data.name x[total_noSeg$health2 == “ILL”] and x[total_noSeg$health2 == “Healthy”] x[total_noSeg$health2 == “ILL”] and x[total_noSeg$health2 == “Healthy”] x[total_noSeg$health2 == “ILL”] and x[total_noSeg$health2 == “Healthy”] x[total_noSeg$health2 == “ILL”] and x[total_noSeg$health2 == “Healthy”] x[total_noSeg$health2 == “ILL”] and x[total_noSeg$health2 == “Healthy”] x[total_noSeg$health2 == “ILL”] and x[total_noSeg$health2 == “Healthy”] x[total_noSeg$health2 == “ILL”] and x[total_noSeg$health2 == “Healthy”] x[total_noSeg$health2 == “ILL”] and x[total_noSeg$health2 == “Healthy”] x[total_noSeg$health2 == “ILL”] and x[total_noSeg$health2 == “Healthy”] x[total_noSeg$health2 == “ILL”] and x[total_noSeg$health2 == “Healthy”] x[total_noSeg$health2 == “ILL”] and x[total_noSeg$health2 == “Healthy”] x[total_noSeg$health2 == “ILL”] and x[total_noSeg$health2 == “Healthy”] x[total_noSeg$health2 == “ILL”] and x[total_noSeg$health2 == “Healthy”] x[total_noSeg$health2 == “ILL”] and x[total_noSeg$health2 == “Healthy”] x[total_noSeg$health2 == “ILL”] and x[total_noSeg$health2 == “Healthy”] x[total_noSeg$health2 == “ILL”] and x[total_noSeg$health2 == “Healthy”] x[total_noSeg$health2 == “ILL”] and x[total_noSeg$health2 == “Healthy”] x[total_noSeg$health2 == “ILL”] and x[total_noSeg$health2 == “Healthy”] x[total_noSeg$health2 == “ILL”] and x[total_noSeg$health2 == “Healthy”] x[total_noSeg$health2 == “ILL”] and x[total_noSeg$health2 == “Healthy”] x[total_noSeg$health2 == “ILL”] and x[total_noSeg$health2 == “Healthy”] x[total_noSeg$health2 == “ILL”] and x[total_noSeg$health2 == “Healthy”] x[total_noSeg$health2 == “ILL”] and x[total_noSeg$health2 == “Healthy”] x[total_noSeg$health2 == “ILL”] and x[total_noSeg$health2 == “Healthy”] x[total_noSeg$health2 == “ILL”] and x[total_noSeg$health2 == “Healthy”] x[total_noSeg$health2 == “ILL”] and x[total_noSeg$health2 == “Healthy”] x[total_noSeg$health2 == “ILL”] and x[total_noSeg$health2 == “Healthy”] x[total_noSeg$health2 == “ILL”] and x[total_noSeg$health2 == “Healthy”] x[total_noSeg$health2 == “ILL”] and x[total_noSeg$health2 == “Healthy”] x[total_noSeg$health2 == “ILL”] and x[total_noSeg$health2 == “Healthy”] x[total_noSeg$health2 == “ILL”] and x[total_noSeg$health2 == “Healthy”] x[total_noSeg$health2 == “ILL”] and x[total_noSeg$health2 == “Healthy”] x[total_noSeg$health2 == “ILL”] and x[total_noSeg$health2 == “Healthy”] x[total_noSeg$health2 == “ILL”] and x[total_noSeg$health2 == “Healthy”] x[total_noSeg$health2 == “ILL”] and x[total_noSeg$health2 == “Healthy”] x[total_noSeg$health2 == “ILL”] and x[total_noSeg$health2 == “Healthy”] x[total_noSeg$health2 == “ILL”] and x[total_noSeg$health2 == “Healthy”] x[total_noSeg$health2 == “ILL”] and x[total_noSeg$health2 == “Healthy”] x[total_noSeg$health2 == “ILL”] and x[total_noSeg$health2 == “Healthy”] x[total_noSeg$health2 == “ILL”] and x[total_noSeg$health2 == “Healthy”]
adjusted_p.value_hoshberg 0.01136145 0.01866002 0.8323904 0.8323904 0.8323904 0.8323904 0.8323904 0.03981285 0.8323904 0.8323904 4.513165e-05 0.009946816 3.982327e-07 0.8323904 0.8323904 0.005454695 0.8323904 0.8323904 0.8323904 0.003746177 1.879888e-05 0.003998618 0.4610347 0.8323904 0.04846931 0.4810816 0.01778365 0.8323904 6.744703e-05 0.8323904 0.05444141 0.8323904 0.8323904 0.0003741963 0.8323904 1.375786e-07 6.159215e-05 0.8323904 0.8323904 0.026913
adjusted_p.value_fdr 0.001305913 0.001974606 0.8323904 0.2842847 0.2825988 0.2456536 0.8228727 0.003981285 0.2430384 0.1023423 1.219774e-05 0.001205675 2.042219e-07 0.4984846 0.5512511 0.0007038316 0.2052773 0.5559413 0.6707372 0.0005553636 6.596097e-06 0.0005553636 0.04411815 0.3464531 0.004751893 0.04581729 0.001954247 0.1035739 1.284705e-05 0.2051986 0.00526004 0.2057613 0.8228727 6.289013e-05 0.5824179 1.375786e-07 1.284705e-05 0.4570257 0.3464531 0.002760308
ill_mean 4.052947 25.71046 34.53927 65.11522 446.2532 347.6143 90.41223 41.24307 23.52489 317.1558 52.79352 59.25787 240.2853 180.0727 23.17314 37.60369 39.23904 89.37466 75.96696 39.18057 105.1448 153.963 0.6754138 4.833879 4.285784 0.5466366 2.819966 0.6376018 0.4019113 0.145391 1.044568 20.88475 13.68517 1.873045 0.6207717 1.38737 0.176677 1.466339 0.3726656 0.3249112
healthy_mean 3.124116 16.89225 35.607 60.76664 499.0092 433.6578 91.01064 50.49853 21.41336 276.4907 76.72541 45.13108 106.5219 160.3457 24.21595 30.74202 34.33406 82.61671 71.83639 60.22598 156.3828 111.1731 0.8031685 4.699328 4.57674 0.5917922 3.650282 0.688878 0.5786692 0.1591727 1.195207 23.70645 14.02739 2.731364 0.6419407 2.311366 0.296098 1.388073 0.386422 0.5218794

kruskal-Wallis test on all columns vs nonsegmentale(vosate lake)

nonsegmental_kw = as.data.frame(sapply(total_noSeg[, c(6:45)], function(x)
  kruskal.test(x~total_noSeg$nonsegmentale)))

nonsegmental_kw = rbind(nonsegmental_kw ,  adjusted_p.value_hochberg = p.adjust(unlist(nonsegmental_kw[3,]), method = 'hochberg'))

nonsegmental_kw = rbind(nonsegmental_kw ,  adjusted_p.value_fdr = p.adjust(unlist(nonsegmental_kw[3,]), method = 'fdr'))

nonsegmental_kw = rbind(nonsegmental_kw, control_mean =colMeans(total_noSeg[total_noSeg$nonsegmentale==0, 6:45]) )

nonsegmental_kw = rbind(nonsegmental_kw, khafif_mean =colMeans(total_noSeg[total_noSeg$nonsegmentale==1, 6:45]) )

nonsegmental_kw = rbind(nonsegmental_kw, shadid_mean =colMeans(total_noSeg[total_noSeg$nonsegmentale==2, 6:45]) )

kable(nonsegmental_kw)
aspartate glutamate asparagine serine glutamine glycine theronine histithine citrolline alanine argenine tyrosine cysteine valine methionine tryptophane phenylalanine isoleucine leucine ornitine lysine proline phetotyr valtophe Xletophe XletoAla Xletotyr mettophe mettotyr mettoxle mettocit glutocit alatocit orntocit cittophe argtophe argtoala argtoorn cittoarg tyrtocys
statistic 17.56103 16.33092 0.1847818 2.341535 2.412248 4.904904 0.01178877 7.581861 6.681908 4.156372 24.50807 17.01707 24.49691 4.818431 4.649657 13.29306 6.617138 5.594535 8.153247 21.49123 28.72841 18.57233 7.726989 0.5130945 6.48968 7.247254 19.33717 3.710166 23.5963 4.891572 10.30448 2.598715 0.04063286 19.79508 3.3545 31.82358 25.8012 1.430741 0.358777 14.5681
parameter 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
p.value 0.000153699 0.0002843062 0.9117487 0.3101289 0.2993553 0.08608227 0.994123 0.02257458 0.03540318 0.1251571 4.765857e-06 0.0002017397 4.792505e-06 0.08988577 0.09780024 0.001298518 0.03656847 0.06097646 0.01696465 2.15396e-05 5.777035e-07 9.269788e-05 0.0209945 0.7737184 0.0389748 0.02668572 6.323938e-05 0.1564399 7.51847e-06 0.086658 0.00578643 0.2727069 0.9798886 5.029832e-05 0.1868872 1.229131e-07 2.496548e-06 0.4890109 0.8357811 0.0006864012
method Kruskal-Wallis rank sum test Kruskal-Wallis rank sum test Kruskal-Wallis rank sum test Kruskal-Wallis rank sum test Kruskal-Wallis rank sum test Kruskal-Wallis rank sum test Kruskal-Wallis rank sum test Kruskal-Wallis rank sum test Kruskal-Wallis rank sum test Kruskal-Wallis rank sum test Kruskal-Wallis rank sum test Kruskal-Wallis rank sum test Kruskal-Wallis rank sum test Kruskal-Wallis rank sum test Kruskal-Wallis rank sum test Kruskal-Wallis rank sum test Kruskal-Wallis rank sum test Kruskal-Wallis rank sum test Kruskal-Wallis rank sum test Kruskal-Wallis rank sum test Kruskal-Wallis rank sum test Kruskal-Wallis rank sum test Kruskal-Wallis rank sum test Kruskal-Wallis rank sum test Kruskal-Wallis rank sum test Kruskal-Wallis rank sum test Kruskal-Wallis rank sum test Kruskal-Wallis rank sum test Kruskal-Wallis rank sum test Kruskal-Wallis rank sum test Kruskal-Wallis rank sum test Kruskal-Wallis rank sum test Kruskal-Wallis rank sum test Kruskal-Wallis rank sum test Kruskal-Wallis rank sum test Kruskal-Wallis rank sum test Kruskal-Wallis rank sum test Kruskal-Wallis rank sum test Kruskal-Wallis rank sum test Kruskal-Wallis rank sum test
data.name x by total_noSeg$nonsegmentale x by total_noSeg$nonsegmentale x by total_noSeg$nonsegmentale x by total_noSeg$nonsegmentale x by total_noSeg$nonsegmentale x by total_noSeg$nonsegmentale x by total_noSeg$nonsegmentale x by total_noSeg$nonsegmentale x by total_noSeg$nonsegmentale x by total_noSeg$nonsegmentale x by total_noSeg$nonsegmentale x by total_noSeg$nonsegmentale x by total_noSeg$nonsegmentale x by total_noSeg$nonsegmentale x by total_noSeg$nonsegmentale x by total_noSeg$nonsegmentale x by total_noSeg$nonsegmentale x by total_noSeg$nonsegmentale x by total_noSeg$nonsegmentale x by total_noSeg$nonsegmentale x by total_noSeg$nonsegmentale x by total_noSeg$nonsegmentale x by total_noSeg$nonsegmentale x by total_noSeg$nonsegmentale x by total_noSeg$nonsegmentale x by total_noSeg$nonsegmentale x by total_noSeg$nonsegmentale x by total_noSeg$nonsegmentale x by total_noSeg$nonsegmentale x by total_noSeg$nonsegmentale x by total_noSeg$nonsegmentale x by total_noSeg$nonsegmentale x by total_noSeg$nonsegmentale x by total_noSeg$nonsegmentale x by total_noSeg$nonsegmentale x by total_noSeg$nonsegmentale x by total_noSeg$nonsegmentale x by total_noSeg$nonsegmentale x by total_noSeg$nonsegmentale x by total_noSeg$nonsegmentale
adjusted_p.value_hochberg 0.004610969 0.007960573 0.994123 0.994123 0.994123 0.994123 0.994123 0.4966408 0.6948009 0.994123 0.0001725302 0.00585045 0.0001725302 0.994123 0.994123 0.03376146 0.6948009 0.994123 0.4071516 0.0007323464 2.253044e-05 0.002873634 0.4828736 0.994123 0.7015464 0.5604 0.00202366 0.994123 0.0002631464 0.994123 0.1446607 0.994123 0.994123 0.001659845 0.994123 4.916524e-06 9.486881e-05 0.994123 0.994123 0.01853283
adjusted_p.value_fdr 0.0005589053 0.0008747882 0.9597354 0.3648575 0.362855 0.1331641 0.994123 0.04752544 0.06648812 0.1726304 3.834004e-05 0.0006724655 3.834004e-05 0.1331641 0.1397146 0.003462714 0.06648812 0.1016274 0.03991682 0.0001230834 1.155407e-05 0.0003707915 0.04665445 0.8596871 0.06778226 0.05337143 0.0002810639 0.2085866 5.012313e-05 0.1331641 0.01446607 0.3408837 0.994123 0.0002514916 0.2411448 4.916524e-06 3.32873e-05 0.5588696 0.9035472 0.001961146
control_mean 3.124116 16.89225 35.607 60.76664 499.0092 433.6578 91.01064 50.49853 21.41336 276.4907 76.72541 45.13108 106.5219 160.3457 24.21595 30.74202 34.33406 82.61671 71.83639 60.22598 156.3828 111.1731 0.8031685 4.699328 4.57674 0.5917922 3.650282 0.688878 0.5786692 0.1591727 1.195207 23.70645 14.02739 2.731364 0.6419407 2.311366 0.296098 1.388073 0.386422 0.5218794
khafif_mean 4.351811 31.83114 33.82565 62.58102 452.0396 368.7695 90.92358 41.08363 25.23839 316.9333 55.10125 62.89814 241.4277 191.2362 24.36319 38.52066 41.42105 98.41708 83.019 42.34168 112.1734 155.1452 0.6725561 4.747437 4.472902 0.6002664 2.930733 0.6390026 0.3970949 0.1435599 1.164709 20.81183 13.96961 2.114185 0.5712807 1.376055 0.1845871 1.45169 0.3845713 0.32802
shadid_mean 3.737478 19.24975 35.29254 67.79022 440.1453 325.2838 89.87248 41.41137 21.7162 317.3907 50.35759 55.41536 239.0795 168.2889 21.91696 36.63578 36.9358 79.82987 68.52315 35.84384 97.7258 152.715 0.6784302 4.925123 4.08827 0.4900274 2.703046 0.6361233 0.4069953 0.1473239 0.9177522 20.96172 13.38492 1.618508 0.6730123 1.399314 0.1683273 1.481802 0.3600984 0.3216298

Mann-Whitney & Kolmogrov-Smirnov test on all columns vs recoddiseas2(modate bimari)

data = total_noSeg %>% filter(recoddiseas2>0)

recoddiseas2_mw = as.data.frame(sapply(data[, c(6:45)], function(x)
  wilcox.test(x~data$recoddiseas2)))

recoddiseas2_mw = rbind(recoddiseas2_mw ,  adjusted_p.value_fdr = p.adjust(unlist(recoddiseas2_mw[3,]), method = 'fdr'))

recoddiseas2_mw = rbind(recoddiseas2_mw ,  adjusted_p.value_hochberg = p.adjust(unlist(recoddiseas2_mw[3,]), method = 'hochberg'))


recoddiseas2_mw = rbind(recoddiseas2_mw, less_six =colMeans(total_noSeg[total_noSeg$recoddiseas2==1, 6:45] ))

recoddiseas2_mw = rbind(recoddiseas2_mw, more_six =colMeans(total_noSeg[total_noSeg$recoddiseas2==2, 6:45]) )



kable(recoddiseas2_mw)
aspartate glutamate asparagine serine glutamine glycine theronine histithine citrolline alanine argenine tyrosine cysteine valine methionine tryptophane phenylalanine isoleucine leucine ornitine lysine proline phetotyr valtophe Xletophe XletoAla Xletotyr mettophe mettotyr mettoxle mettocit glutocit alatocit orntocit cittophe argtophe argtoala argtoorn cittoarg tyrtocys
statistic 179 172 136 125 135 172 105 184 141 128 158 119 141 120 120 119 118 116 134 148 157 210 189 165 157 150 180 202 190 190 171 196 180 167 172 209 177 173 221 163
parameter NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL
p.value 0.8218698 0.9880272 0.2983262 0.168745 0.284413 0.9880272 0.04563677 0.7073835 0.3744164 0.1990715 0.7073835 0.1184626 0.3744164 0.1259344 0.1259344 0.1184626 0.1113344 0.09807196 0.2709349 0.4986944 0.6851533 0.2452772 0.5990186 0.8688752 0.6851533 0.537755 0.7985884 0.3583325 0.5782569 0.5782569 1 0.4611587 0.7985884 0.9163372 0.9880272 0.2578903 0.8688752 0.964092 0.1337585 0.8218698
null.value 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
alternative two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided
method Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test
data.name x by data$recoddiseas2 x by data$recoddiseas2 x by data$recoddiseas2 x by data$recoddiseas2 x by data$recoddiseas2 x by data$recoddiseas2 x by data$recoddiseas2 x by data$recoddiseas2 x by data$recoddiseas2 x by data$recoddiseas2 x by data$recoddiseas2 x by data$recoddiseas2 x by data$recoddiseas2 x by data$recoddiseas2 x by data$recoddiseas2 x by data$recoddiseas2 x by data$recoddiseas2 x by data$recoddiseas2 x by data$recoddiseas2 x by data$recoddiseas2 x by data$recoddiseas2 x by data$recoddiseas2 x by data$recoddiseas2 x by data$recoddiseas2 x by data$recoddiseas2 x by data$recoddiseas2 x by data$recoddiseas2 x by data$recoddiseas2 x by data$recoddiseas2 x by data$recoddiseas2 x by data$recoddiseas2 x by data$recoddiseas2 x by data$recoddiseas2 x by data$recoddiseas2 x by data$recoddiseas2 x by data$recoddiseas2 x by data$recoddiseas2 x by data$recoddiseas2 x by data$recoddiseas2 x by data$recoddiseas2
adjusted_p.value_fdr 1 1 0.7955365 0.7499777 0.7955365 1 0.6687924 1 0.8320365 0.7955365 1 0.6687924 0.8320365 0.6687924 0.6687924 0.6687924 0.6687924 0.6687924 0.7955365 0.9973889 1 0.7955365 0.9983643 1 1 0.9983643 1 0.8320365 0.9983643 0.9983643 1 0.9708604 1 1 1 0.7955365 1 1 0.6687924 1
adjusted_p.value_hochberg 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
less_six 4.035562 25.69114 33.08678 60.72593 432.7207 360.8517 81.94685 43.29987 22.84186 301.425 52.30318 55.12769 224.6137 161.8015 22.13957 35.06458 36.90545 82.90511 71.732 39.1252 106.0426 157.5622 0.6792645 4.644915 4.259247 0.5233265 2.820476 0.6589611 0.4056015 0.1441502 1.055709 22.14809 13.97757 2.042163 0.6298461 1.452841 0.1799292 1.51121 0.403157 0.3324435
more_six 4.071297 25.73086 36.07246 69.74836 460.5375 333.6415 99.34791 39.072 24.24586 333.7605 53.31111 63.6175 256.8275 199.3589 24.26413 40.28387 41.70226 96.20362 80.4372 39.23901 104.1972 150.1638 0.6713491 5.033341 4.313796 0.5712417 2.819428 0.6150559 0.3980161 0.1467008 1.032808 19.55123 13.37652 1.694532 0.6111932 1.318261 0.1732441 1.418975 0.3404802 0.3169604

Mann-Whitney & Kolmogrov-Smirnov test on all columns vs familyhistory

data = total_noSeg %>% filter(!is.na(familyhistory))

familyHistory_mw = as.data.frame(sapply(data[, c(6:45)], function(x)
  wilcox.test(x~data$familyhistory)))

familyHistory_mw = rbind(familyHistory_mw ,  adjusted_p.value_hochberg = p.adjust(unlist(familyHistory_mw[3,]), method = 'hochberg'))

familyHistory_mw = rbind(familyHistory_mw ,  adjusted_p.value_fdr = p.adjust(unlist(familyHistory_mw[3,]), method = 'fdr'))

familyHistory_mw = rbind(familyHistory_mw, no_hist =colMeans(total_noSeg[total_noSeg$familyhistory==0, 6:45], na.rm = T) )

familyHistory_mw = rbind(familyHistory_mw, yes_hist =colMeans(total_noSeg[total_noSeg$familyhistory==1, 6:45], na.rm = T) )

kable(familyHistory_mw)
aspartate glutamate asparagine serine glutamine glycine theronine histithine citrolline alanine argenine tyrosine cysteine valine methionine tryptophane phenylalanine isoleucine leucine ornitine lysine proline phetotyr valtophe Xletophe XletoAla Xletotyr mettophe mettotyr mettoxle mettocit glutocit alatocit orntocit cittophe argtophe argtoala argtoorn cittoarg tyrtocys
statistic 113 107 107 130 137 170 67 132 143 116 156 129 119 134 118 92 124 131 143 165 157 100 145 144 157 206 148 190 147 159 173 198 123 174 151 184 216 142 169 188
parameter NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL
p.value 0.178776 0.1239551 0.1239551 0.4223417 0.5610465 0.6716575 0.003845478 0.4598452 0.6947405 0.2119702 1 0.4042639 0.2492676 0.4990909 0.2363706 0.04224361 0.3208436 0.4408712 0.6947405 0.7896609 0.9874847 0.07727561 0.7417229 0.7181022 0.9874847 0.1162177 0.813936 0.2907876 0.7896609 0.9374838 0.6042625 0.1893946 0.3055774 0.5824727 0.8877215 0.3866445 0.05764172 0.6716575 0.6947405 0.3208436
null.value 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
alternative two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided
method Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test
data.name x by data$familyhistory x by data$familyhistory x by data$familyhistory x by data$familyhistory x by data$familyhistory x by data$familyhistory x by data$familyhistory x by data$familyhistory x by data$familyhistory x by data$familyhistory x by data$familyhistory x by data$familyhistory x by data$familyhistory x by data$familyhistory x by data$familyhistory x by data$familyhistory x by data$familyhistory x by data$familyhistory x by data$familyhistory x by data$familyhistory x by data$familyhistory x by data$familyhistory x by data$familyhistory x by data$familyhistory x by data$familyhistory x by data$familyhistory x by data$familyhistory x by data$familyhistory x by data$familyhistory x by data$familyhistory x by data$familyhistory x by data$familyhistory x by data$familyhistory x by data$familyhistory x by data$familyhistory x by data$familyhistory x by data$familyhistory x by data$familyhistory x by data$familyhistory x by data$familyhistory
adjusted_p.value_hochberg 1 1 1 1 1 1 0.1538191 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
adjusted_p.value_fdr 0.8021091 0.7083147 0.7083147 0.8758956 0.9263206 0.9263206 0.1538191 0.8758956 0.9263206 0.8021091 1 0.8758956 0.8021091 0.9074381 0.8021091 0.7083147 0.8021091 0.8758956 0.9263206 0.9290128 1 0.7083147 0.9271537 0.9265835 1 0.7083147 0.9302126 0.8021091 0.9290128 1 0.9263206 0.8021091 0.8021091 0.9263206 0.9863572 0.8758956 0.7083147 0.9263206 0.9263206 0.8021091
no_hist 3.834042 23.9139 32.5948 62.12807 439.5452 349.563 82.02316 37.29338 23.29817 307.0609 52.41527 57.80956 226.8009 179.0759 22.48374 35.32728 38.20443 86.46151 75.11564 38.67737 104.3731 143.6107 0.6728893 4.826842 4.320725 0.5675903 2.824321 0.6507367 0.3993928 0.149516 1.087842 22.41097 13.22891 1.932903 0.6174791 1.426197 0.1871778 1.472401 0.3756081 0.3463424
yes_hist 4.457078 29.02719 38.12906 70.62997 458.6372 344.0167 105.8998 48.53481 23.94345 335.7926 53.49183 61.93166 265.1796 181.9129 24.44587 41.80631 41.14907 94.75278 77.53863 40.10955 106.5697 173.0748 0.6800743 4.846869 4.221278 0.5079527 2.811926 0.6133527 0.4065609 0.1377757 0.9646776 18.06712 14.52749 1.762538 0.6268505 1.315689 0.1572908 1.455148 0.3672332 0.2853461
# familyHistory_ks = as.data.frame(sapply(data[, 6:45], function(x)
#   ks.test(x~data$familyhistory)))

Mann-Whitney & Kolmogrov-Smirnov test on all columns vs newlesion

data = total_noSeg %>% filter(!is.na(newlesion))
newlesion_mw = as.data.frame(sapply(data[, c(6:45)], function(x)
  wilcox.test(x~data$newlesion)))


newlesion_mw = rbind(newlesion_mw ,  adjusted_p.value_fdr = p.adjust(unlist(newlesion_mw[3,]), method = 'fdr'))

newlesion_mw = rbind(newlesion_mw ,  adjusted_p.value_hochberg = p.adjust(unlist(newlesion_mw[3,]), method = 'hochberg'))

newlesion_mw = rbind(newlesion_mw, no_new_lesion =colMeans(total_noSeg[total_noSeg$recoddiseas2==0, 6:45], na.rm = T) )

newlesion_mw = rbind(newlesion_mw, yes_new_lesion =colMeans(total_noSeg[total_noSeg$recoddiseas2==1, 6:45], na.rm = T) )


kable(newlesion_mw)
aspartate glutamate asparagine serine glutamine glycine theronine histithine citrolline alanine argenine tyrosine cysteine valine methionine tryptophane phenylalanine isoleucine leucine ornitine lysine proline phetotyr valtophe Xletophe XletoAla Xletotyr mettophe mettotyr mettoxle mettocit glutocit alatocit orntocit cittophe argtophe argtoala argtoorn cittoarg tyrtocys
statistic 152 171 105 121 124 154 158 158 136 144 105 131 166 155 156 189 171 130 133 125 112 156 186 158 129 138 151 147 188 184 109 166 159 120 213 94 123 172 152 147
parameter NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL
p.value 0.7927673 0.7690258 0.08216865 0.219181 0.2566871 0.8407945 0.9384257 0.9384257 0.4478842 0.6097609 0.08216865 0.3602474 0.8894059 0.865038 0.8894059 0.3939649 0.7690258 0.3440713 0.3939649 0.2700989 0.1300296 0.8894059 0.4478842 0.9384257 0.328354 0.4859826 0.7690258 0.6763469 0.4114979 0.4859826 0.1074387 0.8894059 0.9630324 0.2075762 0.1074387 0.03609037 0.2437323 0.7454936 0.7927673 0.6763469
null.value 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
alternative two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided
method Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test
data.name x by data$newlesion x by data$newlesion x by data$newlesion x by data$newlesion x by data$newlesion x by data$newlesion x by data$newlesion x by data$newlesion x by data$newlesion x by data$newlesion x by data$newlesion x by data$newlesion x by data$newlesion x by data$newlesion x by data$newlesion x by data$newlesion x by data$newlesion x by data$newlesion x by data$newlesion x by data$newlesion x by data$newlesion x by data$newlesion x by data$newlesion x by data$newlesion x by data$newlesion x by data$newlesion x by data$newlesion x by data$newlesion x by data$newlesion x by data$newlesion x by data$newlesion x by data$newlesion x by data$newlesion x by data$newlesion x by data$newlesion x by data$newlesion x by data$newlesion x by data$newlesion x by data$newlesion x by data$newlesion
adjusted_p.value_fdr 0.9624879 0.9624879 0.8595095 0.9256812 0.9256812 0.9624879 0.9624879 0.9624879 0.9256812 0.9624879 0.8595095 0.9256812 0.9624879 0.9624879 0.9624879 0.9256812 0.9624879 0.9256812 0.9256812 0.9256812 0.8668638 0.9624879 0.9256812 0.9624879 0.9256812 0.9256812 0.9624879 0.9624879 0.9256812 0.9256812 0.8595095 0.9624879 0.9630324 0.9256812 0.8595095 0.8595095 0.9256812 0.9624879 0.9624879 0.9624879
adjusted_p.value_hochberg 0.9630324 0.9630324 0.9630324 0.9630324 0.9630324 0.9630324 0.9630324 0.9630324 0.9630324 0.9630324 0.9630324 0.9630324 0.9630324 0.9630324 0.9630324 0.9630324 0.9630324 0.9630324 0.9630324 0.9630324 0.9630324 0.9630324 0.9630324 0.9630324 0.9630324 0.9630324 0.9630324 0.9630324 0.9630324 0.9630324 0.9630324 0.9630324 0.9630324 0.9630324 0.9630324 0.9630324 0.9630324 0.9630324 0.9630324 0.9630324
no_new_lesion 3.124116 16.89225 35.607 60.76664 499.0092 433.6578 91.01064 50.49853 21.41336 276.4907 76.72541 45.13108 106.5219 160.3457 24.21595 30.74202 34.33406 82.61671 71.83639 60.22598 156.3828 111.1731 0.8031685 4.699328 4.57674 0.5917922 3.650282 0.688878 0.5786692 0.1591727 1.195207 23.70645 14.02739 2.731364 0.6419407 2.311366 0.296098 1.388073 0.386422 0.5218794
yes_new_lesion 4.035562 25.69114 33.08678 60.72593 432.7207 360.8517 81.94685 43.29987 22.84186 301.425 52.30318 55.12769 224.6137 161.8015 22.13957 35.06458 36.90545 82.90511 71.732 39.1252 106.0426 157.5622 0.6792645 4.644915 4.259247 0.5233265 2.820476 0.6589611 0.4056015 0.1441502 1.055709 22.14809 13.97757 2.042163 0.6298461 1.452841 0.1799292 1.51121 0.403157 0.3324435
# newlesion_ks = as.data.frame(sapply(data[, 6:45], function(x)
#   ks.test(x~data$newlesion)))

Mann-Whitney & Kolmogrov-Smirnov test on all columns vs recodnonseg2

data = total_noSeg %>% filter(!is.na(recodnonseg2))
recodnonseg2_mw = as.data.frame(sapply(data[, c(6:45)], function(x)
  wilcox.test(x~data$recodnonseg2)))


recodnonseg2_mw = rbind(recodnonseg2_mw ,  adjusted_p.value_fdr = p.adjust(unlist(recodnonseg2_mw[3,]), method = 'fdr'))

recodnonseg2_mw = rbind(recodnonseg2_mw ,  adjusted_p.value_hochberg = p.adjust(unlist(recodnonseg2_mw[3,]), method = 'hochberg'))

recodnonseg2_mw = rbind(recodnonseg2_mw, khafif_mean =colMeans(total_noSeg[total_noSeg$recodnonseg2==0, 6:45], na.rm = T) )

recodnonseg2_mw = rbind(recodnonseg2_mw, shadid_mean =colMeans(total_noSeg[total_noSeg$recodnonseg2==1, 6:45], na.rm = T) )


kable(recodnonseg2_mw)
aspartate glutamate asparagine serine glutamine glycine theronine histithine citrolline alanine argenine tyrosine cysteine valine methionine tryptophane phenylalanine isoleucine leucine ornitine lysine proline phetotyr valtophe Xletophe XletoAla Xletotyr mettophe mettotyr mettoxle mettocit glutocit alatocit orntocit cittophe argtophe argtoala argtoorn cittoarg tyrtocys
statistic 225 264 152 139 202 180 171 154 237 170 208 239 162 223 223 194 215 237 257 207 203 187 164 177 213 232 207 174 163 145 229 153 173 212 114 164 195 139 184 198
parameter NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL NULL
p.value 0.1045406 0.004033456 0.5782569 0.3426794 0.3583325 0.7985884 1 0.6201035 0.04563677 0.9880272 0.2709349 0.03920604 0.7985884 0.1184626 0.1184626 0.4986944 0.1885597 0.04563677 0.008219927 0.284413 0.3426794 0.6414975 0.8453054 0.8688752 0.2099941 0.06555546 0.284413 0.9401884 0.8218698 0.442985 0.08052259 0.5990186 0.964092 0.2213329 0.08607255 0.8453054 0.4797312 0.3426794 0.7073835 0.4252178
null.value 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
alternative two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided two.sided
method Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test Wilcoxon rank sum test
data.name x by data$recodnonseg2 x by data$recodnonseg2 x by data$recodnonseg2 x by data$recodnonseg2 x by data$recodnonseg2 x by data$recodnonseg2 x by data$recodnonseg2 x by data$recodnonseg2 x by data$recodnonseg2 x by data$recodnonseg2 x by data$recodnonseg2 x by data$recodnonseg2 x by data$recodnonseg2 x by data$recodnonseg2 x by data$recodnonseg2 x by data$recodnonseg2 x by data$recodnonseg2 x by data$recodnonseg2 x by data$recodnonseg2 x by data$recodnonseg2 x by data$recodnonseg2 x by data$recodnonseg2 x by data$recodnonseg2 x by data$recodnonseg2 x by data$recodnonseg2 x by data$recodnonseg2 x by data$recodnonseg2 x by data$recodnonseg2 x by data$recodnonseg2 x by data$recodnonseg2 x by data$recodnonseg2 x by data$recodnonseg2 x by data$recodnonseg2 x by data$recodnonseg2 x by data$recodnonseg2 x by data$recodnonseg2 x by data$recodnonseg2 x by data$recodnonseg2 x by data$recodnonseg2 x by data$recodnonseg2
adjusted_p.value_fdr 0.4307732 0.1613383 0.8848242 0.682538 0.682538 0.9654168 1 0.8848242 0.3650942 1 0.6692071 0.3650942 0.9654168 0.4307732 0.4307732 0.7979111 0.6285324 0.3650942 0.1643985 0.6692071 0.682538 0.8848242 0.9654168 0.9654168 0.6323797 0.4303628 0.6692071 1 0.9654168 0.7704087 0.4303628 0.8848242 1 0.6323797 0.4303628 0.9654168 0.7979111 0.682538 0.943178 0.7704087
adjusted_p.value_hochberg 1 0.1613383 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0.3205772 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
khafif_mean 4.351811 31.83114 33.82565 62.58102 452.0396 368.7695 90.92358 41.08363 25.23839 316.9333 55.10125 62.89814 241.4277 191.2362 24.36319 38.52066 41.42105 98.41708 83.019 42.34168 112.1734 155.1452 0.6725561 4.747437 4.472902 0.6002664 2.930733 0.6390026 0.3970949 0.1435599 1.164709 20.81183 13.96961 2.114185 0.5712807 1.376055 0.1845871 1.45169 0.3845713 0.32802
shadid_mean 3.737478 19.24975 35.29254 67.79022 440.1453 325.2838 89.87248 41.41137 21.7162 317.3907 50.35759 55.41536 239.0795 168.2889 21.91696 36.63578 36.9358 79.82987 68.52315 35.84384 97.7258 152.715 0.6784302 4.925123 4.08827 0.4900274 2.703046 0.6361233 0.4069953 0.1473239 0.9177522 20.96172 13.38492 1.618508 0.6730123 1.399314 0.1683273 1.481802 0.3600984 0.3216298
# newlesion_ks = as.data.frame(sapply(data[, 6:45], function(x)
#   ks.test(x~data$newlesion)))

heatmap & dendrograms with health2 as labels

heatmap(as.matrix(total_noSeg[, c(6:27)]), scale="column", labRow  = total_noSeg$health2)

heatmap & dendrograms with newlesion as labels

heatmap(as.matrix(total_noSeg[, c(6:27)]), scale="column", labRow  = total_noSeg$newlesion)

heatmap & dendrograms with familyHistory as labels

heatmap(as.matrix(total_noSeg[, c(6:27)], scale="column", labRow  = total_noSeg$familyhistory))

heatmap & dendrograms with nonsegmentale as labels

heatmap(as.matrix(total_noSeg[, c(6:27)], scale="column", labRow  = total_noSeg$nonsegmentale))

heatmap & dendrograms with recoddiseas2 as labels

heatmap(as.matrix(total_noSeg[, c(6:27)], scale="column", labRow  = total_noSeg$recoddiseas2))

random forest(feature selection)

data = total_noSeg[, c(4,6:27)]
data$health2 = as.factor(data$health2)
output.forest <- randomForest(health2 ~ ., 
           data )
print(output.forest)
## 
## Call:
##  randomForest(formula = health2 ~ ., data = data) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 4
## 
##         OOB estimate of  error rate: 9.86%
## Confusion matrix:
##         Healthy ILL class.error
## Healthy      34   0   0.0000000
## ILL           7  30   0.1891892
print(output.forest$confusion)
##         Healthy ILL class.error
## Healthy      34   0   0.0000000
## ILL           7  30   0.1891892
importance =data.frame(name = row.names(output.forest$importance),MeanDecreaseGini = output.forest$importance)
knitr::kable(arrange(importance, desc(MeanDecreaseGini)))
name MeanDecreaseGini
lysine 4.4508826
argenine 4.1735678
cysteine 4.0230278
ornitine 3.0911975
tyrosine 2.9597111
proline 1.8405242
aspartate 1.6779623
tryptophane 1.5600716
glutamate 1.4869353
histithine 1.2569367
glutamine 1.0023942
methionine 0.9501119
glycine 0.8812520
asparagine 0.7936855
phenylalanine 0.7179538
serine 0.7052845
valine 0.6020607
alanine 0.6000511
isoleucine 0.5802141
theronine 0.5590213
citrolline 0.5186358
leucine 0.5049686

plot Random Forest Model

plot(output.forest)

heatmap with most important features(no proportions)

most_important_features = importance %>% filter(MeanDecreaseGini>1)
data=  total_noSeg[ , which(names(total_noSeg) %in% most_important_features$name)]

heatmap(as.matrix(data),scale="column", labRow = total_noSeg$health2)

boxplots for most important features

for (i in 1:ncol(data)){
  print(i)
  name = colnames(data)[i]
  plot = ggplot(data, aes(x = total_noSeg$health2, y = data[, i])) +
  geom_boxplot()+ ylab(name)+ xlab('health')
  print(plot)
}
## [1] 1

## [1] 2

## [1] 3

## [1] 4

## [1] 5

## [1] 6

## [1] 7

## [1] 8

## [1] 9

## [1] 10

## [1] 11

random forest with proportions(feature selection)

data = total_noSeg[, c(4,6:45)]
data$health2 = as.factor(data$health2)
output.forest_prop <- randomForest(health2 ~ ., 
           data )

print(output.forest_prop)
## 
## Call:
##  randomForest(formula = health2 ~ ., data = data) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 6
## 
##         OOB estimate of  error rate: 12.68%
## Confusion matrix:
##         Healthy ILL class.error
## Healthy      31   3  0.08823529
## ILL           6  31  0.16216216
print(output.forest_prop$confusion)
##         Healthy ILL class.error
## Healthy      31   3  0.08823529
## ILL           6  31  0.16216216
importance_prop =data.frame(name = row.names(output.forest_prop$importance),MeanDecreaseGini = output.forest_prop$importance)
importance_prop = arrange(importance_prop, desc(MeanDecreaseGini))
knitr::kable(importance_prop)
name MeanDecreaseGini
argtophe 4.2677499
lysine 3.2539367
cysteine 3.0031481
argenine 2.2609431
argtoala 2.0338725
mettotyr 1.8357484
orntocit 1.7089554
ornitine 1.6020697
Xletotyr 1.0910564
tyrosine 1.0520867
proline 1.0296687
tryptophane 0.9557608
aspartate 0.9556368
glutamate 0.8260966
Xletophe 0.7189635
tyrtocys 0.7040800
histithine 0.6904941
argtoorn 0.5951173
mettocit 0.4921165
glutamine 0.4588401
glutocit 0.3578793
XletoAla 0.3409359
methionine 0.3345465
leucine 0.3299992
phetotyr 0.3240922
asparagine 0.3167259
valine 0.3112363
glycine 0.2853957
cittophe 0.2826849
alanine 0.2813152
mettoxle 0.2809446
cittoarg 0.2773493
valtophe 0.2732817
serine 0.2604170
citrolline 0.2462431
isoleucine 0.2085660
alatocit 0.2016680
theronine 0.1935923
phenylalanine 0.1854059
mettophe 0.1554923

plot Random Forest Model (with proportions)

plot(output.forest_prop)

heatmap with most important features(no proportions)

most_important_features = importance_prop %>% filter(MeanDecreaseGini>1)
data=  total_noSeg[ , which(names(total_noSeg) %in% most_important_features$name)]

heatmap(as.matrix(data),scale="column", labRow = total_noSeg$health2)

boxplots for most important features

for (i in 1:ncol(data)){
  print(i)
  name = colnames(data)[i]
  plot = ggplot(data, aes(x = total_noSeg$health2, y = data[, i])) +
  geom_boxplot()+ ylab(name)+ xlab('health')
  print(plot)
}
## [1] 1

## [1] 2

## [1] 3

## [1] 4

## [1] 5

## [1] 6

## [1] 7

## [1] 8

## [1] 9

## [1] 10

## [1] 11

heatmap with most important features(proportions are included)

most_important_features = importance_prop %>% filter(MeanDecreaseGini>1)
data=  total_noSeg[ , which(names(total_noSeg) %in% most_important_features$name)]

heatmap(as.matrix(data),scale="column", labRow = total_noSeg$health2)

predict with glm

data$health2 = total_noSeg$health2
data$health2 [data$health2=='ILL']= 1
data$health2 [data$health2=='Healthy']= 0
data$health2= as.numeric(data$health2)
model = glm( data=data, health2~., binomial(link='logit'))


print(summary(model))
## 
## Call:
## glm(formula = health2 ~ ., family = binomial(link = "logit"), 
##     data = data)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.21439  -0.22762   0.01774   0.11962   2.46490  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)  
## (Intercept) 10.437944   8.621792   1.211   0.2260  
## argenine    -0.078095   0.073915  -1.057   0.2907  
## tyrosine     0.047833   0.103108   0.464   0.6427  
## cysteine     0.018071   0.010326   1.750   0.0801 .
## ornitine    -0.019864   0.048774  -0.407   0.6838  
## lysine      -0.018213   0.024747  -0.736   0.4617  
## proline     -0.004119   0.012225  -0.337   0.7361  
## Xletotyr    -1.022730   1.294238  -0.790   0.4294  
## mettotyr    -0.816744   8.157086  -0.100   0.9202  
## orntocit    -0.623229   0.913357  -0.682   0.4950  
## argtophe    -1.957933   2.146485  -0.912   0.3617  
## argtoala     8.712359  10.951594   0.796   0.4263  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 98.300  on 70  degrees of freedom
## Residual deviance: 23.613  on 59  degrees of freedom
## AIC: 47.613
## 
## Number of Fisher Scoring iterations: 7
predict_data = data.frame(true=data$health2 ,predicted = ifelse(fitted(model) > 0.5, '1', '0'), m = fitted(model))
predict_data$res =  predict_data$predicted==predict_data$true
(sum(predict_data$res)/ nrow(predict_data))*100
## [1] 94.3662

RocPlot for glm model

basicplot <- ggplot(predict_data, aes(d = true, m = m)) + geom_roc(n.cuts = 0, labels = T)
basicplot+ annotate("text", x = .75, y = .25, label = paste("AUC =", round(calc_auc(basicplot)$AUC, 2)))

pairwise scatter plot for most important features

most_important_features = importance_prop %>% filter(MeanDecreaseGini>1)
data=  total_noSeg[ , which(names(total_noSeg) %in% most_important_features$name)]

data$health2 = total_noSeg$health2
ggpairs(data=data,             # data.frame with variables
           mapping = ggplot2::aes(colour = as.factor(health2)),
lower = list(continuous = wrap("smooth", method = "lm")),
             columns=1:ncol(data),           # columns to plot, default to all.
           title=("Vitiligo data")    # title of the plot
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

RandomForest for nonsegmentale (Feature Selection)

data = total_noSeg[, c(6:45)]
data$nonsegmentale = as.factor(total_noSeg$nonsegmentale)
output.forest_nonseg <- randomForest(nonsegmentale ~ ., data= data )
print(output.forest_nonseg)
## 
## Call:
##  randomForest(formula = nonsegmentale ~ ., data = data) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 6
## 
##         OOB estimate of  error rate: 28.17%
## Confusion matrix:
##    0 1  2 class.error
## 0 34 0  0   0.0000000
## 1  5 6  8   0.6842105
## 2  3 4 11   0.3888889
print(output.forest_nonseg$confusion)
##    0 1  2 class.error
## 0 34 0  0   0.0000000
## 1  5 6  8   0.6842105
## 2  3 4 11   0.3888889
importance_nonSeg =data.frame(name = row.names(output.forest_nonseg$importance),MeanDecreaseGini = output.forest_nonseg$importance)
importance_nonSeg = arrange(importance_nonSeg, desc(MeanDecreaseGini))
knitr::kable(importance_nonSeg)
name MeanDecreaseGini
argtophe 3.6202689
lysine 3.4626188
glutamate 2.5802438
cysteine 2.5042770
argenine 2.0677510
orntocit 1.9359773
mettotyr 1.7553453
ornitine 1.7343827
argtoala 1.6192344
Xletotyr 1.3525380
leucine 1.2547284
aspartate 1.2426284
tyrtocys 1.1649300
tyrosine 1.1017939
Xletophe 1.0276210
proline 1.0252972
mettocit 0.9864488
histithine 0.9040171
tryptophane 0.8891480
asparagine 0.8743952
argtoorn 0.8494314
XletoAla 0.8211537
citrolline 0.7043045
isoleucine 0.6804186
methionine 0.6482214
valine 0.6096959
phetotyr 0.5938201
glutamine 0.5904713
cittoarg 0.5871350
phenylalanine 0.5702291
serine 0.5649915
valtophe 0.5509432
glutocit 0.5119671
theronine 0.4752374
glycine 0.4605956
cittophe 0.4458480
mettophe 0.4432158
mettoxle 0.4315571
alatocit 0.3778848
alanine 0.3387265

plot random forest model for nonsegmentale

plot(output.forest_nonseg)

heatmap with most important features for nonsegmentale(proportions are included)

most_important_features = importance_nonSeg %>% filter(MeanDecreaseGini>1)
data=  total_noSeg[ , which(names(total_noSeg) %in% most_important_features$name)]

heatmap(as.matrix(data),scale="column", labRow = total_noSeg$nonsegmentale)

boxplots for most important features

for (i in 1:ncol(data)){
  print(i)
  name = colnames(data)[i]
  plot = ggplot(data, aes(x = as.factor(total_noSeg$nonsegmentale), y = data[, i])) +
  geom_boxplot()+ ylab(name)+ xlab('nonsegmentale')
  print(plot)
}
## [1] 1

## [1] 2

## [1] 3

## [1] 4

## [1] 5

## [1] 6

## [1] 7

## [1] 8

## [1] 9

## [1] 10

## [1] 11

## [1] 12

## [1] 13

## [1] 14

## [1] 15

## [1] 16

RandomForest for recodnonseg2 (Feature Selection)

data = total_noSeg[!is.na(total_noSeg$recodnonseg2), c(6:45)]
data$recodnonseg2 = as.factor(total_noSeg[!is.na(total_noSeg$recodnonseg2), ]$recodnonseg2)
output.forest_nonseg2 <- randomForest(recodnonseg2 ~ ., data= data )
print(output.forest_nonseg2)
## 
## Call:
##  randomForest(formula = recodnonseg2 ~ ., data = data) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 6
## 
##         OOB estimate of  error rate: 40.54%
## Confusion matrix:
##   0  1 class.error
## 0 9 10   0.5263158
## 1 5 13   0.2777778
print(output.forest_nonseg2$confusion)
##   0  1 class.error
## 0 9 10   0.5263158
## 1 5 13   0.2777778
importance_nonSeg2 =data.frame(name = row.names(output.forest_nonseg2$importance),MeanDecreaseGini = output.forest_nonseg2$importance)
importance_nonSeg2 = arrange(importance_nonSeg2, desc(MeanDecreaseGini))
knitr::kable(importance_nonSeg2)
name MeanDecreaseGini
glutamate 1.3771335
leucine 1.0178354
tyrosine 0.8484063
asparagine 0.7192235
mettocit 0.6620016
citrolline 0.6595340
glutamine 0.5745504
lysine 0.5729731
cittophe 0.5381576
Xletotyr 0.5116598
XletoAla 0.5056345
methionine 0.4855840
orntocit 0.4773288
mettotyr 0.4651458
serine 0.4605557
isoleucine 0.4260063
glutocit 0.4060991
argenine 0.3936690
phenylalanine 0.3893310
aspartate 0.3767340
argtophe 0.3764566
argtoala 0.3727233
argtoorn 0.3607005
proline 0.3494384
valine 0.3366872
mettoxle 0.3343400
alanine 0.3340815
ornitine 0.3231181
histithine 0.3204424
Xletophe 0.3077756
theronine 0.3023159
valtophe 0.3022618
cittoarg 0.2876059
tryptophane 0.2801585
phetotyr 0.2754304
tyrtocys 0.2602275
mettophe 0.2601541
glycine 0.2575445
alatocit 0.2254761
cysteine 0.2218226

predict with glm: recodnonseg2

most_important_features = importance_nonSeg2 %>% filter(MeanDecreaseGini>0.8)
data=  total_noSeg[ , which(names(total_noSeg) %in% most_important_features$name)]

data$recodnonseg2 = as.numeric(total_noSeg$recodnonseg2)

#when we add the below factors, AUC changes from 0.79 to 0.89
data$familyhistory = as.factor(total_noSeg$familyhistory)
data$recoddiseas2 = as.factor(total_noSeg$recoddiseas2)
data$newlesion = as.factor(total_noSeg$newlesion)
data = na.omit(data)
model = glm( data=data, recodnonseg2~., family = binomial(link = 'logit'))


print(summary(model))
## 
## Call:
## glm(formula = recodnonseg2 ~ ., family = binomial(link = "logit"), 
##     data = data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.3174  -0.7784  -0.0556   0.5432   2.1714  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)  
## (Intercept)     4.31156    2.21217   1.949   0.0513 .
## glutamate      -0.12552    0.05747  -2.184   0.0290 *
## tyrosine        0.02115    0.04699   0.450   0.6526  
## leucine        -0.04690    0.04031  -1.163   0.2447  
## familyhistory1  1.99901    1.16842   1.711   0.0871 .
## recoddiseas22   1.57539    1.13865   1.384   0.1665  
## newlesion1     -0.92081    1.02231  -0.901   0.3677  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 51.266  on 36  degrees of freedom
## Residual deviance: 30.808  on 30  degrees of freedom
## AIC: 44.808
## 
## Number of Fisher Scoring iterations: 5
predict_data = data.frame(true=total_noSeg[!is.na(total_noSeg$recodnonseg2), ]$recodnonseg2 ,predicted = ifelse(fitted(model) > 0.5, '1', '0'), m = fitted(model))
predict_data$res =  predict_data$predicted==predict_data$true
(sum(predict_data$res)/ nrow(predict_data))*100
## [1] 78.37838

RocPlot for glm model

basicplot <- ggplot(predict_data, aes(d = true, m = m)) + geom_roc(n.cuts = 0, labels = T)
basicplot+ annotate("text", x = .75, y = .25, label = paste("AUC =", round(calc_auc(basicplot)$AUC, 2)))